Clustering ASVs

Code
# Setup ----
# Load libraries, resolve project directories relative to the git root, and
# source every helper function shipped in the functions/ directory.
library(dplyr) ; library(tidyr) ; library(ggplot2)
library(doParallel) ; library(foreach) ; library(doSNOW)

# Locate the project root by the presence of .git/index
root <- rprojroot::has_file(".git/index")
datadir <- root$find_file("data")
funsdir <- root$find_file("functions")
savingdir <- root$find_file("saved_files")

# Long-format ASV abundance table
datapath <- root$find_file(file.path(datadir, "grump.phaeocystis_asv_long.csv"))
files_vec <- list.files(funsdir)
currentwd <- getwd()

# Source each helper; iterating over the vector directly (instead of
# 1:length(...)) is safe when the directory is empty.
for (fun_file in files_vec) {
  source(root$find_file(file.path(funsdir, fun_file)))
}

getwd()
[1] "/Users/rafaelcatoia/Desktop/repos/clustersPhaeo/phaeoClusters/saved_files"
Code
# Names of the helper functions that must be exported to the parallel
# workers by finding_optimal_k_parallel().
# NOTE(review): "funcitons" is a typo, but the name is referenced verbatim
# further down, so it is kept for compatibility.
funcitons_parallel <- c(
  "clusterize_ward_pam",
  "eval_adist_clustering",
  "inducedDist",
  "summarise_adist_clustering",
  "tidy_grump"
)

# Read the raw long-format ASV table and tidy it with the project helper.
dframe <- data.table::fread(input = datapath)
dframe <- tidy_grump(Dframe = dframe)

The idea is to create a ‘custom’ distance matrix to induce the groupings.

  • \(c_1\) is within the same species - but no unassigned
  • \(c_2\) is between assigned species
  • \(c_3\) is between species and unassigned
  • \(c_4\) is within unassigned and unassigned

For this document we will use only the following configuration:

\(c_1=1\), \(c_2=1000\), \(c_3=10\) , \(c_4=10\)

A priori, the ‘probability’ of unassigned ASVs agglomerating among themselves is the same as that of agglomerating with another species. A large \(c_2\) makes it harder for ASVs of known species to group with each other.

Code
# Build the custom (induced) distance matrix with the chosen penalties:
#   c1 = within the same assigned species, c2 = between assigned species,
#   c3 = species vs unassigned,            c4 = within unassigned.
inducedDist_ <- inducedDist(
  dFrame = dframe$dframe,
  c1 = 1, c2 = 1000, c3 = 10, c4 = 10,
  compMatrix = dframe$ASV_composition
)

# inducedDist_$myDist_checking  # (manual sanity check, left disabled)

\[ S = \frac{\mathrm{numerator}}{\mathrm{denominator}} \]

Where:

\[ \mathrm{numerator} = \frac{1}{k} \sum_{j=1}^{k} \left[ \frac{1}{n_j}\sum_{i=1}^{n_j} d(x_i, \bar x^{(j)} ) \right] \]

The mean average distance within clusters (each observation and the medoid of the cluster)

\[ \mathrm{denominator} = \frac{1}{\binom{k}{2}} \sum_{i \neq j } d( \bar x^{(i)}, \bar x^{(j)} ) \]

The average distance between clusters (each cluster is represented by its medoid)

\[ S = \frac{numerator}{denominator} \]

Where:

\[ SS_T = \frac{1}{N} \sum_{i=1}^{N-1} \sum_{j=i+1}^{N} d^2_{ij} \]

where \(d^2_{ij}\) is the squared distance between objects \(i\) and \(j\).

\[ SS_W = \frac{1}{n} \sum_{i=1}^{N-1} \sum_{j=i+1}^{N} d^2_{ij}\,\delta_{ij} \]

\(\delta_{ij}\) is an indicator function (1 if observations \(i\) and \(j\) are from the same cluster, 0 otherwise). \(SS_A\) is then the difference between the overall and the within-groups sum-of-squares (\(SS_{A}=SS_{T}-SS_{W}\)).

\[ F = \frac{\frac{SS_{A}}{p-1}}{\frac{SS_{W}}{N-p}} \]

The idea of the new metric is: what if we represent the heterogeneity of a cluster by its maximum within-cluster distance? The average maximum distance within clusters then tends to decrease as the number of clusters increases.

On the other hand, let's treat the minimum distance between clusters as a measure of how different two clusters are. The average minimum distance between clusters also tends to decrease as the number of clusters increases.

Let's call this metric \(T\).

\[T = \frac{\frac{1}{k}\sum_{(i,j)\in k}\max(d(x_i,x_j))} {\frac{1}{k}\sum_{(i,j)\notin k}\min(d(x_i,x_j))}\]

Code
running <- FALSE

# Fit Ward hierarchical and PAM clusterings (k = 1..50) for several mixing
# weights alpha between the induced distance and the Aitchison distance:
#   distMatrix = alpha * normalizedMyDist + (1 - alpha) * normalizedAitDist
# alpha = 0 uses the Aitchison distance directly (also avoids 0 * Inf = NaN
# should the induced distance ever contain Inf). The four original
# copy-pasted blocks are collapsed into one loop; results are cached on
# disk so the expensive step only runs when `running` is TRUE.
if (running) {
  alphas   <- c(0, 0.10, 0.25, 0.50)
  suffixes <- c('alpha0', 'alpha0.10', 'alpha0.25', 'alpha0.50')

  for (i in seq_along(alphas)) {
    alpha_ <- alphas[i]
    if (alpha_ == 0) {
      distMatrix_ <- inducedDist_$normalizedAitDist
    } else {
      distMatrix_ <- alpha_ * inducedDist_$normalizedMyDist +
        (1 - alpha_) * inducedDist_$normalizedAitDist
    }
    clusters_i <- clusterize_ward_pam(
      distMatrix = distMatrix_,
      maxnclust = 50,
      dFrameAsv = dframe$ASV_composition$name)
    saveRDS(object = clusters_i,
            file = paste0(savingdir, '/', 'list_with_clusters_', suffixes[i]))
  }
}

# Reload the cached clustering results (one object per alpha)
list_with_clusters_alpha0 <- readRDS(file = paste0(savingdir, '/', 'list_with_clusters_alpha0'))
list_with_clusters_alpha0.10 <- readRDS(file = paste0(savingdir, '/', 'list_with_clusters_alpha0.10'))
list_with_clusters_alpha0.25 <- readRDS(file = paste0(savingdir, '/', 'list_with_clusters_alpha0.25'))
list_with_clusters_alpha0.50 <- readRDS(file = paste0(savingdir, '/', 'list_with_clusters_alpha0.50'))
Code
running <- FALSE

# Evaluate the alpha = 0 hierarchical clustering at every k: column i + 1 of
# dFrameAsv_hclust holds the labels for the i-th partition; join them onto
# the ASV composition and score with eval_adist_clustering(). Cached to disk.
if (running) {
  nclusters <- ncol(list_with_clusters_alpha0$dFrameAsv_hclust) - 1
  # preallocate instead of growing the list inside the loop
  list_results <- vector("list", nclusters)

  for (i in seq_len(nclusters)) {
    list_results[[i]] <- eval_adist_clustering(compositionDF = left_join(
      list_with_clusters_alpha0$dFrameAsv_hclust[, c(1, i + 1)] %>% rename('name' = 1, 'Cluster' = 2),
      dframe$ASV_composition))
    cat('iteration------:', i, '\n')
  }
  saveRDS(list_results, file = paste0(savingdir, '/', 'list_results_eval_alpha0_hclust'))
}

list_results_eval_alpha0_hclust <- readRDS(paste0(savingdir, '/', 'list_results_eval_alpha0_hclust'))

# One line plot per evaluation metric vs number of clusters (hclust, alpha = 0)
summarise_adist_clustering(list_results_eval_alpha0_hclust) %>%
  reshape2::melt(id.vars = "nclust") %>%
  filter(nclust > 1) %>%
  ggplot(aes(x = nclust, y = value, group = variable)) +
  geom_line() +
  facet_wrap(~ variable, scales = "free_y") +
  theme_minimal(base_size = 12)

Code
running <- FALSE

# Same evaluation as above, but for the PAM partitions of the alpha = 0
# clustering (labels in columns 2..k+1 of dFrameAsv_pam). Cached to disk.
if (running) {
  nclusters <- ncol(list_with_clusters_alpha0$dFrameAsv_pam) - 1
  # preallocate instead of growing the list inside the loop
  list_results <- vector("list", nclusters)

  for (i in seq_len(nclusters)) {
    list_results[[i]] <- eval_adist_clustering(compositionDF = left_join(
      list_with_clusters_alpha0$dFrameAsv_pam[, c(1, i + 1)] %>% rename('name' = 1, 'Cluster' = 2),
      dframe$ASV_composition))
    cat('iteration------:', i, '\n')
  }
  saveRDS(list_results, file = paste0(savingdir, '/', 'list_results_eval_alpha0_pam'))
}

list_results_eval_alpha0_pam <- readRDS(paste0(savingdir, '/', 'list_results_eval_alpha0_pam'))

# One line plot per evaluation metric vs number of clusters (pam, alpha = 0)
summarise_adist_clustering(list_results_eval_alpha0_pam) %>%
  reshape2::melt(id.vars = "nclust") %>%
  filter(nclust > 1) %>%
  ggplot(aes(x = nclust, y = value, group = variable)) +
  geom_line() +
  facet_wrap(~ variable, scales = "free_y") +
  theme_minimal(base_size = 12)

This section contains only the scripts used to generate the results.

Code
############# -- arranging data

############# Finding K -------------------------------------------------------------------
# Bootstrap "CV-like" search for the optimal k at alpha = 0:
# B = 500 resamples, 50/50 sample split, k up to 50. The helper functions
# named in funcitons_parallel are exported to the parallel workers.
running <- FALSE
if (running) {
  set.seed(1234)
  alpha0_bootstrap <- finding_optimal_k_parallel(
    dframe_asv = dframe$dframe %>% select(SampleKey, ASV_name, Raw.Sequence.Counts),
    B = 500, split_pct = 0.5, maxnclust_ = 50, vec_functions_fromEnv = funcitons_parallel)
  saveRDS(alpha0_bootstrap, file = paste0(savingdir, '/', 'alpha0_bootstrap'))
}

############# Finding K -------------------------------------------------------------------
Code
############# Permanova -------------------------------------------------------------------
# Permutation MANOVA (B = 500) of each cached clustering against the
# normalized Aitchison distance, one run per alpha. The four original
# copy-pasted blocks are collapsed into one loop; each result is saved
# under the name it is keyed by in perm_inputs.
running <- FALSE
if (running) {
  perm_inputs <- list(
    'perm_alpha0'    = list_with_clusters_alpha0,
    'perm_alpha0.10' = list_with_clusters_alpha0.10,
    'perm_alpha0.25' = list_with_clusters_alpha0.25,
    'perm_alpha0.50' = list_with_clusters_alpha0.50
  )
  for (nm in names(perm_inputs)) {
    perm_res <- permanova_custom(
      list_with_clusters = perm_inputs[[nm]],
      ait_distMatrix = inducedDist_$normalizedAitDist,
      B = 500)
    saveRDS(perm_res, file = paste0(savingdir, '/', nm))
  }
}
Code
## arranging the results

############# Finding K -------------------------------------------------------------------
alpha0_cv <- readRDS(paste0(savingdir, '/', 'alpha0_bootstrap'))
alpha0_cv_df <- data.table::rbindlist(alpha0_cv) %>% data.frame()



############# Permanova -------------------------------------------------------------------
# Flatten one cached permanova result into a tidy data frame. Each cached
# object is a list with $hclust and $pam, each a list of anova tables; the
# first row of every table carries nclust, Pr(>F) and R2. Replaces four
# copy-pasted read/flatten sections (which also had inconsistent trailing
# commas inside bind_rows()).
read_perm_results <- function(file_suffix, alpha_value) {
  perm <- readRDS(paste0(savingdir, '/', file_suffix))
  # pull the statistics of interest from a single anova table
  extract_stats <- function(el) {
    data.frame(nclust = el$nclust[1], pvalue = el$`Pr(>F)`[1], r2 = el$R2[1])
  }
  bind_rows(
    plyr::ldply(perm$hclust, extract_stats, .id = "nclust") %>% mutate(method = 'hclust'),
    plyr::ldply(perm$pam, extract_stats, .id = "nclust") %>% mutate(method = 'pam')
  ) %>% mutate(alpha = alpha_value)
}

df_perm_alpha0 <- read_perm_results('perm_alpha0', 0)
df_perm_alpha0.10 <- read_perm_results('perm_alpha0.10', 0.10)
df_perm_alpha0.25 <- read_perm_results('perm_alpha0.25', 0.25)
df_perm_alpha0.50 <- read_perm_results('perm_alpha0.50', 0.50)

# One long table: nclust / pvalue / r2 / method / alpha
df_perm <- bind_rows(df_perm_alpha0, df_perm_alpha0.10, df_perm_alpha0.25, df_perm_alpha0.50)

The ‘CV-like’ measurement was calculated over 500 iterations of:

  1. After the split

    • create the composition of ASVs in 50% of the samples
    • fit the clustering methods (hclust and pam) from k = 1 : 50
    • evaluate each cluster in the ‘test’ data
Code
# (exploratory) list the distinct melted variable names:
# alpha0_cv_df %>%
#   reshape2::melt(id.vars = c("nclust", "method", "iteration")) %>%
#   filter(nclust > 1) %>%
#   select(-iteration) %>% select(variable) %>% distinct() %>% pull()

# Metric groupings used to facet the diagnostic plots below.
group_summary <- c("ratio_avg_within_between", "ratio_avg_max_min")

group_permanova <- c("Fstat", "SSA", "SSW")
group_S <- c("ratio_avg_within_between", "mean_avg_within", "avg_between_medoids")

group_max <- c("max_max_dist_within", "avg_max_dist_within",
               "max_min_dist_clust_i_other", "avg_min_dist_clust_i_other")
Code
# Mean +/- one sd of the ratio-summary metrics across bootstrap iterations,
# by method and number of clusters.
alpha0_cv_df %>%
  reshape2::melt(id.vars = c("nclust", "method", "iteration")) %>%
  filter(nclust > 1) %>%
  select(-iteration) %>%
  filter(variable %in% group_summary) %>%
  group_by(method, nclust, variable) %>%
  summarise(avg = mean(value), stdev = sd(value)) %>%
  ungroup() %>%
  ggplot(aes(x = nclust, y = avg, group = method, color = method, fill = method)) +
  geom_line() +
  geom_ribbon(aes(ymin = avg - stdev, ymax = avg + stdev), alpha = 0.5) +
  facet_wrap(~ variable, scales = "free_y") +
  theme_minimal(base_size = 16) +
  theme(legend.position = "bottom")

Code
# Mean +/- one sd of the permanova-style metrics (Fstat, SSA, SSW) across
# bootstrap iterations, by method and number of clusters.
alpha0_cv_df %>%
  reshape2::melt(id.vars = c("nclust", "method", "iteration")) %>%
  filter(nclust > 1) %>%
  select(-iteration) %>%
  filter(variable %in% group_permanova) %>%
  group_by(method, nclust, variable) %>%
  summarise(avg = mean(value), stdev = sd(value)) %>%
  ungroup() %>%
  ggplot(aes(x = nclust, y = avg, group = method, color = method, fill = method)) +
  geom_line() +
  geom_ribbon(aes(ymin = avg - stdev, ymax = avg + stdev), alpha = 0.5) +
  facet_wrap(~ variable, scales = "free_y") +
  theme_minimal(base_size = 16) +
  theme(legend.position = "bottom")

Code
# Mean +/- one sd of the S-metric components (within/between averages)
# across bootstrap iterations, by method and number of clusters.
alpha0_cv_df %>%
  reshape2::melt(id.vars = c("nclust", "method", "iteration")) %>%
  filter(nclust > 1) %>%
  select(-iteration) %>%
  filter(variable %in% group_S) %>%
  group_by(method, nclust, variable) %>%
  summarise(avg = mean(value), stdev = sd(value)) %>%
  ungroup() %>%
  ggplot(aes(x = nclust, y = avg, group = method, color = method, fill = method)) +
  geom_line() +
  geom_ribbon(aes(ymin = avg - stdev, ymax = avg + stdev), alpha = 0.5) +
  facet_wrap(~ variable, scales = "free_y") +
  theme_minimal(base_size = 16) +
  theme(legend.position = "bottom")

Code
# Mean +/- one sd of the max/min distance metrics across bootstrap
# iterations, by method and number of clusters.
alpha0_cv_df %>%
  reshape2::melt(id.vars = c("nclust", "method", "iteration")) %>%
  filter(nclust > 1) %>%
  select(-iteration) %>%
  filter(variable %in% group_max) %>%
  group_by(method, nclust, variable) %>%
  summarise(avg = mean(value), stdev = sd(value)) %>%
  ungroup() %>%
  ggplot(aes(x = nclust, y = avg, group = method, color = method, fill = method)) +
  geom_line() +
  geom_ribbon(aes(ymin = avg - stdev, ymax = avg + stdev), alpha = 0.5) +
  facet_wrap(~ variable, scales = "free_y") +
  theme_minimal(base_size = 16) +
  theme(legend.position = "bottom")

Code
# Count, per method / k / metric, how many bootstrap iterations produced an
# infinite value (the ratio metrics blow up when a denominator hits zero).
alpha0_cv_df %>%
  reshape2::melt(id.vars = c("nclust", "method", "iteration")) %>%
  filter(nclust > 1) %>%
  select(-iteration) %>%
  group_by(method, nclust, variable) %>%
  summarise(Freq_Problem = sum(is.infinite(value))) %>%
  filter(Freq_Problem > 0) %>%
  knitr::kable() %>%
  kableExtra::kable_styling()
method nclust variable Freq_Problem
hclustD 2 ratio_avg_within_between 74
hclustD 2 ratio_avg_max_min 75
hclustD 3 ratio_avg_max_min 74
pam 2 ratio_avg_within_between 82
pam 2 ratio_avg_max_min 82
pam 3 ratio_avg_within_between 27
pam 3 ratio_avg_max_min 27
pam 4 ratio_avg_within_between 2
pam 4 ratio_avg_max_min 2
Code
# NOTE(review): this chunk is an exact duplicate of the infinite-value
# table above; kept as rendered in the original document.
alpha0_cv_df %>%
  reshape2::melt(id.vars = c("nclust", "method", "iteration")) %>%
  filter(nclust > 1) %>%
  select(-iteration) %>%
  group_by(method, nclust, variable) %>%
  summarise(Freq_Problem = sum(is.infinite(value))) %>%
  filter(Freq_Problem > 0) %>%
  knitr::kable() %>%
  kableExtra::kable_styling()
method nclust variable Freq_Problem
hclustD 2 ratio_avg_within_between 74
hclustD 2 ratio_avg_max_min 75
hclustD 3 ratio_avg_max_min 74
pam 2 ratio_avg_within_between 82
pam 2 ratio_avg_max_min 82
pam 3 ratio_avg_within_between 27
pam 3 ratio_avg_max_min 27
pam 4 ratio_avg_within_between 2
pam 4 ratio_avg_max_min 2
Code
# Count, per method / k / metric, how many bootstrap iterations produced an
# exact zero (zero between-cluster distances explain the Inf ratios above).
alpha0_cv_df %>%
  reshape2::melt(id.vars = c("nclust", "method", "iteration")) %>%
  filter(nclust > 1) %>%
  select(-iteration) %>%
  group_by(method, nclust, variable) %>%
  summarise(Freq_Problem = sum(value == 0)) %>%
  filter(Freq_Problem > 0) %>%
  knitr::kable() %>%
  kableExtra::kable_styling()
method nclust variable Freq_Problem
hclustD 2 avg_between_medoids 74
hclustD 2 max_min_dist_clust_i_other 75
hclustD 2 avg_min_dist_clust_i_other 75
hclustD 3 max_min_dist_clust_i_other 74
hclustD 3 avg_min_dist_clust_i_other 74
pam 2 avg_between_medoids 82
pam 2 max_min_dist_clust_i_other 82
pam 2 avg_min_dist_clust_i_other 82
pam 3 avg_between_medoids 27
pam 3 max_min_dist_clust_i_other 27
pam 3 avg_min_dist_clust_i_other 27
pam 4 avg_between_medoids 2
pam 4 max_min_dist_clust_i_other 2
pam 4 avg_min_dist_clust_i_other 2
Code
# Mean +/- one sd of every metric (no variable filter), by method and k.
alpha0_cv_df %>%
  reshape2::melt(id.vars = c("nclust", "method", "iteration")) %>%
  filter(nclust > 1) %>%
  select(-iteration) %>%
  group_by(method, nclust, variable) %>%
  summarise(avg = mean(value), stdev = sd(value)) %>%
  ungroup() %>%
  ggplot(aes(x = nclust, y = avg, group = method, color = method, fill = method)) +
  geom_line() +
  geom_ribbon(aes(ymin = avg - stdev, ymax = avg + stdev), alpha = 0.5) +
  facet_wrap(~ variable, scales = "free_y") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")

Code
# PERMANOVA R^2 vs number of clusters; one line per method x alpha combination.
df_perm %>%
  filter(nclust > 1) %>%
  mutate(alpha_ = factor(alpha)) %>%
  ggplot(aes(x = nclust, y = r2, color = method, linetype = alpha_)) +
  geom_line() +
  theme_minimal() +
  theme(legend.position = "bottom")